confusion <- function(yhat, y, quietly = FALSE){
if(!quietly) message("yhat is the vector of predicted outcomes, possibly a factor.
  \n Sensitivity = (first level predicted) / (first level actual) 
  \n Specificity = (second level predicted) / (second level actual)")
if(!is.factor(y) & is.factor(yhat))
  y <- as.factor(y)
if(!all.equal(levels(yhat), levels(y))) 
  stop("Factor levels of yhat and y do not match.")
confusion_mat <- table(yhat, y, deparse.level = 2)
stats <- data.frame(sensitivity = confusion_mat[1, 1]/sum(confusion_mat[, 1]), specificity 
                    = confusion_mat[2, 2]/sum(confusion_mat[, 2]))
return(list(confusion_mat = confusion_mat, stats = stats))
}
fifadataraw <- read_csv("data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   ID = col_double(),
##   Age = col_double(),
##   Overall = col_double(),
##   Potential = col_double(),
##   Special = col_double(),
##   `International Reputation` = col_double(),
##   `Weak Foot` = col_double(),
##   `Skill Moves` = col_double(),
##   `Jersey Number` = col_double(),
##   Crossing = col_double(),
##   Finishing = col_double(),
##   HeadingAccuracy = col_double(),
##   ShortPassing = col_double(),
##   Volleys = col_double(),
##   Dribbling = col_double(),
##   Curve = col_double(),
##   FKAccuracy = col_double(),
##   LongPassing = col_double(),
##   BallControl = col_double()
##   # ... with 24 more columns
## )
## See spec(...) for full column specifications.
fifadataraw1 <- fifadataraw

fifadataraw1 <- fifadataraw1 %>% mutate(League = ifelse(Club == "FC Bayern München" | Club == "Borussia Dortmund" | Club == "RB Leipzig" | Club == "Bayer 04 Leverkusen" | Club == "Borussia Mönchengladbach" | Club == "VfL Wolfsburg" | Club == "Eintracht Frankfurt" | Club == "SV Werd Bremen" | Club == "TSG 1899 Hoffenheim" | Club == "Fortuna Düsseldorf" | Club == "Hertha BSC" | Club == "1. FSV Mainz 05" | Club == "SC Freiburg" | Club == "FC Schalke 04" | Club == "FC Augsburg" | Club == "VfB Stuttgart" | Club == "Hannover 96" | Club == "FC Nürnberg", "Bundesliga", ifelse(Club == "FC Barcelona" | Club == "Real Madrid" | Club == "Atlético Madrid" | Club == "Valencia CF" | Club == "Getafe CF" | Club == "Sevilla FC" | Club == "RCD Espanyol" | Club == "Athletic Club de Bilbao" | Club == "Real Sociedad" | Club == "Real Betis" | Club == "Deportivo Alavés" | Club == "SD Eibar" | Club == "CD Leganés" | Club == "Villarreal CF" | Club == "Levante UD" | Club == "Real Valladolid CF" | Club == "RC Celta" | Club == "Girona FC" | Club == "SD Huesca" | Club == "Rayo Vallecano", "La Liga", ifelse(Club == "Manchester City" | Club == "Liverpool" | Club == "Chelsea" | Club == "Tottenham Hotspur"| Club == "Arsenal" | Club == "Manchester United" | Club == "Wolverhampton Wanderers" | Club == "Everton" | Club == "Leicester City" | Club == "West Ham United" | Club == "Watford" | Club == "Crystal Palace" | Club == "Newcastle United" | Club == "Bournemouth" | Club == "Burnley" | Club == "Southampton" | Club == "Brighton & Hove Albion" | Club == "Cardiff City" | Club == "Fulham" | Club == "Huddersfield Town", "Premier League", ifelse(Club == "Juventus" | Club == "Napoli" | Club == "Atalanta" | Club == "Inter" | Club == "Milan" | Club == "Roma" | Club == "Torino" | Club == "Lazio" | Club == "Sampdoria" | Club == "Bologna" | Club == "Sassuolo" | Club == "Udinese" | Club == "SPAL" | Club == "Parma" | Club == "Cagliari" | Club == "Fiorentina" | Club == "Genoa" | Club == "Empoli" | Club == "Frosinone" | Club == "Chievo Verona", "Serie A", ifelse(Club == "Paris Saint-Germain" | Club == "LOSC Lille" | Club == "Olympique Lyonnais" | Club == "AS Saint-Étienne" | Club == "Olympique de Marseille" | Club == "Montpellier HSC" | Club == "OGC Nice" | Club == "Stade de Reims" | Club == "Nîmes Olympique" | Club == "Stade Rennais FC" | Club == "RC Strasbourg Alsace" | Club == "FC Nantes" | Club == "Angers SCO" | Club == "FC Girondins de Bordeaux" | Club == "Amiens SC" | Club == "Toulouse Football Club" | Club == "AS Monaco" | Club == "Dijon FCO" | Club == "Stade Malherbe Caen" | Club == "En Avant de Guingamp", "Ligue 1", "NA"))))))

fifa_data2 <- fifadataraw1 %>% dplyr::select(-c(ID, Flag, Photo, 11, 29:54)) %>%
  mutate(ValueMultiplier = ifelse(str_detect(Value, "K"), 1000, ifelse(str_detect(Value, "M"), 1000000, 1))) %>%
  mutate(Value = as.numeric(str_extract(Value, "[[:digit:]]+\\.*[[:digit:]]*")) * ValueMultiplier) %>%
  mutate(Position = ifelse(is.na(Position), "Unknown", Position))

fifa_data2 <- fifa_data2 %>%
  mutate(WageMultiplier = ifelse(str_detect(Wage, "K"), 1000, ifelse(str_detect(Wage, "M"), 1000000, 1))) %>%
  mutate(Wage = as.numeric(str_extract(Wage, "[[:digit:]]+\\.*[[:digit:]]*")) * WageMultiplier)

temp1 <- sapply(fifa_data2$Weight, parse_number)
fifa_data2$Weight <- as.numeric(temp1)

temp2 <- strsplit(fifa_data2$Height, "'")
for (i in 1:length(temp2)) {
  temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
  temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}

temp3 <- as.numeric(unlist(temp2))

fifa_data2$Height <- temp3

colleague <- c(4, 10:22, 59, 61, 62)

fifa_data3 <- fifa_data2[,-colleague]

fifa_data4 <- fifa_data3[!is.na(fifa_data3$League),]

set.seed(1234)
cv1 <- fold(fifa_data4, k = 5, cat_col = 'League', id_col = 'X1')
cv1 <- cv1 %>%
  rename(fold = .folds)
testleague <- subset(cv1, fold == 5)
trainleague <- anti_join(cv1, testleague)
## Joining, by = c("X1", "Name", "Age", "Overall", "Potential", "Club", "Value", "Wage", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "League", "fold")
fifa_data <- fifadataraw %>% dplyr::select(-c(ID, Flag, Photo, 11, 29:54)) %>%
  mutate(ValueMultiplier = ifelse(str_detect(Value, "K"), 1000, ifelse(str_detect(Value, "M"), 1000000, 1))) %>%
  mutate(Value = as.numeric(str_extract(Value, "[[:digit:]]+\\.*[[:digit:]]*")) * ValueMultiplier) %>%
  mutate(Position = ifelse(is.na(Position), "Unknown", Position))
fifa_data <- fifa_data %>%
  mutate(WageMultiplier = ifelse(str_detect(Wage, "K"), 1000, ifelse(str_detect(Wage, "M"), 1000000, 1))) %>%
  mutate(Wage = as.numeric(str_extract(Wage, "[[:digit:]]+\\.*[[:digit:]]*")) * WageMultiplier)

off <- c('ST', 'CF', 'LF', 'LS', 'LW', 'RF', 'RS', 'RW', 'CAM', 'LAM', 'RAM')

def <- c('CB', 'LB', 'LCB', 'LWB', 'RB', 'RCB', 'RWB', 'LDM', 'CDM', 'RDM')

mid <- c('CM', 'LCM', 'RCM', 'RM', 'LM')

gk <- c('GK')

fifa_data <- fifa_data  %>% mutate(pgroup = ifelse(Position %in% gk, "GK", ifelse(Position %in% off, "OFF", ifelse(Position %in% mid, "MID", ifelse(Position %in% def, "DEF", "NA")))))

##AGE

colpca <- c(1, 3, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])

pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Age, -pgroup), scale = TRUE)

screeplot(pca_fifa)

pca_scores <- pca_fifa$x
ldr3 <- pca_scores %>%
  data.frame() %>%
  mutate(Age = fifa_data_pca$Age, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
  dplyr::select(Age, pgroup, everything())



##VALUE
colpca <- c(1, 8, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])

pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Value, -pgroup), scale = TRUE)

screeplot(pca_fifa)

pca_scores <- pca_fifa$x
ldr8 <- pca_scores %>%
  data.frame() %>%
  mutate(Value = fifa_data_pca$Value, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
  dplyr::select(Value, pgroup, everything())



##WAGE
colpca <- c(1, 9, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])

pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Wage, -pgroup), scale = TRUE)

screeplot(pca_fifa)

pca_scores <- pca_fifa$x
ldr9 <- pca_scores %>%
  data.frame() %>%
  mutate(Wage = fifa_data_pca$Wage, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
  dplyr::select(Wage,pgroup, everything())



#SPECIAL
colpca <- c(1, 10, 5, 6, 25:58,62)
fifa_data_pca <- na.omit(fifa_data[colpca])

pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Special, -pgroup), scale = TRUE)

screeplot(pca_fifa)

pca_scores <- pca_fifa$x
ldr10 <- pca_scores %>%
  data.frame() %>%
  mutate(Special = fifa_data_pca$Special, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
  dplyr::select(Special, pgroup, everything())
ldr3.1 <- ldr3[-which(ldr3$pgroup == "NA"),]

colors <- c("#999999", "#E69F00", "#56B4E9", "#FF0000")
colors <- colors[as.factor(ldr3.1$pgroup)]
s3d <- scatterplot3d(ldr3.1[,c(3,4,5)], pch = 1,  color=colors)

#legend("left", legend = levels(ldr3.1$pgroup), col = colors, pch = 16)

colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr8$pgroup)]
s4d <- scatterplot3d(ldr8[,c(3,4,1)], pch = 1,  color=colors)
legend("left", legend = levels(as.factor(ldr8$pgroup)), col = colors, pch = 16)

colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr9$pgroup)]
scatterplot3d(ldr9[,c(3,4,1)], pch = 1,  color=colors)
legend("left", legend = levels(as.factor(ldr9$pgroup)),
      col = colors, pch = 16)

colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr10$pgroup)]
scatterplot3d(ldr10[,c(3,4,1)], pch = 1,  color=colors)
legend("left", legend = levels(as.factor(ldr10$pgroup)),
      col = colors, pch = 16)

ggplot(ldr3.1, aes(x = PC1, y = PC2, color = ldr3.1$pgroup)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-10, 15)) +theme_light()

fifa_data <- fifa_data %>% mutate(AgeGroup = ifelse(Age <= 22, "young", ifelse(Age > 22 & Age <= 27, "middle", ifelse(Age >= 28, "old", "NA"))))
fifa_data_pagegroup <- transform(fifa_data, pagegroup = paste(pgroup,AgeGroup))
head(fifa_data_pagegroup)
##   X1              Name Age Nationality Overall Potential
## 1  0          L. Messi  31   Argentina      94        94
## 2  1 Cristiano Ronaldo  33    Portugal      94        94
## 3  2         Neymar Jr  26      Brazil      92        93
## 4  3            De Gea  27       Spain      91        93
## 5  4      K. De Bruyne  27     Belgium      91        92
## 6  5         E. Hazard  27     Belgium      91        91
##                  Club     Value   Wage Special Preferred.Foot
## 1        FC Barcelona 110500000 565000    2202           Left
## 2            Juventus  77000000 405000    2228          Right
## 3 Paris Saint-Germain 118500000 290000    2143          Right
## 4   Manchester United  72000000 260000    1471          Right
## 5     Manchester City 102000000 355000    2281          Right
## 6             Chelsea  93000000 340000    2142          Right
##   International.Reputation Weak.Foot Skill.Moves      Work.Rate  Body.Type
## 1                        5         4           4 Medium/ Medium      Messi
## 2                        5         4           5      High/ Low C. Ronaldo
## 3                        5         5           5   High/ Medium     Neymar
## 4                        4         3           1 Medium/ Medium       Lean
## 5                        4         5           4     High/ High     Normal
## 6                        4         4           4   High/ Medium     Normal
##   Real.Face Position Jersey.Number       Joined Loaned.From
## 1       Yes       RF            10  Jul 1, 2004        <NA>
## 2       Yes       ST             7 Jul 10, 2018        <NA>
## 3       Yes       LW            10  Aug 3, 2017        <NA>
## 4       Yes       GK             1  Jul 1, 2011        <NA>
## 5       Yes      RCM             7 Aug 30, 2015        <NA>
## 6       Yes       LF            10  Jul 1, 2012        <NA>
##   Contract.Valid.Until Height Weight Crossing Finishing HeadingAccuracy
## 1                 2021    5'7 159lbs       84        95              70
## 2                 2022    6'2 183lbs       84        94              89
## 3                 2022    5'9 150lbs       79        87              62
## 4                 2020    6'4 168lbs       17        13              21
## 5                 2023   5'11 154lbs       93        82              55
## 6                 2020    5'8 163lbs       81        84              61
##   ShortPassing Volleys Dribbling Curve FKAccuracy LongPassing BallControl
## 1           90      86        97    93         94          87          96
## 2           81      87        88    81         76          77          94
## 3           84      84        96    88         87          78          95
## 4           50      13        18    21         19          51          42
## 5           92      82        86    85         83          91          91
## 6           89      80        95    83         79          83          94
##   Acceleration SprintSpeed Agility Reactions Balance ShotPower Jumping
## 1           91          86      91        95      95        85      68
## 2           89          91      87        96      70        95      95
## 3           94          90      96        94      84        80      61
## 4           57          58      60        90      43        31      67
## 5           78          76      79        91      77        91      63
## 6           94          88      95        90      94        82      56
##   Stamina Strength LongShots Aggression Interceptions Positioning Vision
## 1      72       59        94         48            22          94     94
## 2      88       79        93         63            29          95     82
## 3      81       49        82         56            36          89     87
## 4      43       64        12         38            30          12     68
## 5      90       75        91         76            61          87     94
## 6      83       66        80         54            41          87     89
##   Penalties Composure Marking StandingTackle SlidingTackle GKDiving
## 1        75        96      33             28            26        6
## 2        85        95      28             31            23        7
## 3        81        94      27             24            33        9
## 4        40        68      15             21            13       90
## 5        79        88      68             58            51       15
## 6        86        91      34             27            22       11
##   GKHandling GKKicking GKPositioning GKReflexes Release.Clause
## 1         11        15            14          8        €226.5M
## 2         11        15            14         11        €127.1M
## 3          9        15            15         11        €228.1M
## 4         85        87            88         94        €138.6M
## 5         13         5            10         13        €196.4M
## 6         12         6             8          8        €172.1M
##   ValueMultiplier WageMultiplier pgroup AgeGroup  pagegroup
## 1           1e+06           1000    OFF      old    OFF old
## 2           1e+06           1000    OFF      old    OFF old
## 3           1e+06           1000    OFF   middle OFF middle
## 4           1e+06           1000     GK   middle  GK middle
## 5           1e+06           1000    MID   middle MID middle
## 6           1e+06           1000    OFF   middle OFF middle
extract <- function(x){
  regexp <- "[[:digit:]]+"
  str_extract(x, regexp)
}
temp1 <- sapply(fifa_data_pagegroup$Weight, extract)
fifa_data_pagegroup$Weight <- as.numeric(temp1)

temp2 <- strsplit(fifa_data_pagegroup$Height, "'")
for (i in 1:length(temp2)) {
  temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
  temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}

temp3 <- as.numeric(unlist(temp2))

fifa_data_pagegroup$Height <- temp3

colors2 <- c("palegreen1", "lightskyblue1", "honeydew2", "cyan3", "orchid", "grey55", "papayawhip", "mediumturquoise", "darkslategrey", "salmon2", "gold1", "tomato3")
colpca <- c(1, 8, 5, 6, 25:58, 64)
fifa_data_pca2 <- na.omit(fifa_data_pagegroup[colpca])

pca_fifa2 <- prcomp(fifa_data_pca2 %>% dplyr::select(-X1, -Value, -pagegroup), scale = TRUE)

screeplot(pca_fifa2)

pca_scores2 <- pca_fifa2$x
ldr8_2 <- pca_scores2 %>%
  data.frame() %>%
  mutate(Value = fifa_data_pca2$Value, X1 = fifa_data_pca2$X1, pagegroup = fifa_data_pca2$pagegroup) %>%
  dplyr::select(Value, pagegroup, everything())

colors2 <- colors2[as.factor(ldr8_2$pagegroup)]
s4d <- scatterplot3d(ldr8[,c(3,4,1)], pch = 1,  color=colors2)
legend("right", legend = levels(as.factor(ldr8_2$pagegroup)),
      col = colors2, pch = 16)

ggplot(ldr8_2, aes(x = PC1, y = PC2, color = ldr8_2$pagegroup)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-10, 15)) +theme_light()

fifa_data_pca2 <- fifa_data_pca2[-c(1,2)]

ldahist1 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF young"),]
ldahist2 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF middle"),]
ldahist3 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF old"),]
ldahist4 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF young"),]
ldahist5 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF middle"),]
ldahist6 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF old"),]
ldahist7 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK young"),]
ldahist8 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK middle"),]
ldahist9 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK old"),]
ldahist10 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID young"),]
ldahist11<- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID middle"),]
ldahist12 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID old"),]
 
ldahist1 %>% gather()
ldahist2 %>% gather()
ldahist3 %>% gather()
ldahist4 %>% gather()
ldahist5 %>% gather()
ldahist6 %>% gather()
ldahist7 %>% gather()
ldahist8 %>% gather()
ldahist9 %>% gather()
ldahist10 %>% gather()
ldahist11 %>% gather()
ldahist12 %>% gather()

ggplot(gather(ldahist1),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist2),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist3),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist4),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist5),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist6),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist7),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist8),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist9),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist10),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist11),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist12),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
colpca1 <- c(1, 3, 5, 6, 8, 9, 18, 23:58, 64)

fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA young'),]
fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA middle'),]
fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA old'),]
fifa_data_pagegroup <- na.omit(fifa_data_pagegroup[colpca1])
fifa_data_pagegroup1 <- fifa_data_pagegroup

set.seed(1234)
cv <- fold(fifa_data_pagegroup1, k = 5, cat_col = 'pagegroup', id_col = 'X1')
cv <- cv %>%
  rename(fold = .folds)

test <- subset(cv, fold == 5)
train <- anti_join(cv, test)
## Joining, by = c("X1", "Age", "Overall", "Potential", "Value", "Wage", "Position", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "pagegroup", "fold")
head(fifa_data_pagegroup1)
##   X1 Age Overall Potential     Value   Wage Position Height Weight
## 1  0  31      94        94 110500000 565000       RF     67    159
## 2  1  33      94        94  77000000 405000       ST     74    183
## 3  2  26      92        93 118500000 290000       LW     69    150
## 4  3  27      91        93  72000000 260000       GK     76    168
## 5  4  27      91        92 102000000 355000      RCM     71    154
## 6  5  27      91        91  93000000 340000       LF     68    163
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       84        95              70           90      86        97    93
## 2       84        94              89           81      87        88    81
## 3       79        87              62           84      84        96    88
## 4       17        13              21           50      13        18    21
## 5       93        82              55           92      82        86    85
## 6       81        84              61           89      80        95    83
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility
## 1         94          87          96           91          86      91
## 2         76          77          94           89          91      87
## 3         87          78          95           94          90      96
## 4         19          51          42           57          58      60
## 5         83          91          91           78          76      79
## 6         79          83          94           94          88      95
##   Reactions Balance ShotPower Jumping Stamina Strength LongShots
## 1        95      95        85      68      72       59        94
## 2        96      70        95      95      88       79        93
## 3        94      84        80      61      81       49        82
## 4        90      43        31      67      43       64        12
## 5        91      77        91      63      90       75        91
## 6        90      94        82      56      83       66        80
##   Aggression Interceptions Positioning Vision Penalties Composure Marking
## 1         48            22          94     94        75        96      33
## 2         63            29          95     82        85        95      28
## 3         56            36          89     87        81        94      27
## 4         38            30          12     68        40        68      15
## 5         76            61          87     94        79        88      68
## 6         54            41          87     89        86        91      34
##   StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning
## 1             28            26        6         11        15            14
## 2             31            23        7         11        15            14
## 3             24            33        9          9        15            15
## 4             21            13       90         85        87            88
## 5             58            51       15         13         5            10
## 6             27            22       11         12         6             8
##   GKReflexes  pagegroup
## 1          8    OFF old
## 2         11    OFF old
## 3         11 OFF middle
## 4         94  GK middle
## 5         13 MID middle
## 6          8 OFF middle
ldacol <- c(1, 3:6, 8:44)

trainlda <- train[ldacol]
testlda <- test[ldacol]

testlda1<- as.matrix(testlda)

mlda <- lda(pagegroup ~ ., data = trainlda[,-1]) 
## Warning in lda.default(x, grouping, ...): groups NA middle NA old NA young
## are empty
lda.pred <- predict(mlda, newdata = testlda[,-1])
lda.pred1 <- lda.pred[[1]]
lda.pred1 <- as.matrix(lda.pred1)
confusion(as.vector(lda.pred1), as.vector(testlda1[,42]), quietly = FALSE)
## yhat is the vector of predicted outcomes, possibly a factor.
##   
##  Sensitivity = (first level predicted) / (first level actual) 
##   
##  Specificity = (second level predicted) / (second level actual)
## $confusion_mat
##             y
## yhat         DEF middle DEF old DEF young GK middle GK old GK young
##   DEF middle        373      72        57         0      0        0
##   DEF old            98     365         0         0      0        0
##   DEF young          30       0       332         0      0        0
##   GK middle           0       0         0        64     24       20
##   GK old              0       0         0        37    126        0
##   GK young            0       0         0        25      2      109
##   MID middle         22      10         5         0      0        0
##   MID old            13      49         1         0      0        0
##   MID young           2       0        31         0      0        0
##   OFF middle          1       0         0         0      0        0
##   OFF old             0       0         0         0      0        0
##   OFF young           0       0         1         0      0        0
##             y
## yhat         MID middle MID old MID young OFF middle OFF old OFF young
##   DEF middle         40       5        10          3       0         0
##   DEF old            14      47         0          0       0         0
##   DEF young           5       0        28          0       0         3
##   GK middle           0       0         0          0       0         0
##   GK old              0       0         0          0       0         0
##   GK young            0       0         0          0       0         0
##   MID middle        141      25        17         62      16         8
##   MID old            29     104         0         12      36         0
##   MID young          20       0       205          9       0        67
##   OFF middle         67      14        10        159      52        31
##   OFF old            10      22         0         54     141         1
##   OFF young           5       0        63         13       0       217
## 
## $stats
##   sensitivity specificity
## 1   0.6920223   0.7358871
fifa_data_pagegroup2 <- transform(fifa_data, pagegroup = paste(pgroup,AgeGroup))

extract <- function(x){
  regexp <- "[[:digit:]]+"
  str_extract(x, regexp)
}
temp1 <- sapply(fifa_data_pagegroup2$Weight, extract)
fifa_data_pagegroup2$Weight <- as.numeric(temp1)

temp2 <- strsplit(fifa_data_pagegroup2$Height, "'")
for (i in 1:length(temp2)) {
  temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
  temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}

temp3 <- as.numeric(unlist(temp2))

fifa_data_pagegroup2$Height <- temp3

colpca1 <- c(1, 2, 5, 6, 8, 9, 18, 23:58, 64)

fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA young'),]
fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA middle'),]
fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA old'),]

fifa_data_pagegroup2 <- na.omit(fifa_data_pagegroup2[colpca1])
fifa_data_pagegroup3 <- fifa_data_pagegroup2
fifa_data_pagegroup3$pagegroup <- droplevels(fifa_data_pagegroup3$pagegroup)

set.seed(1234)
cv <- fold(fifa_data_pagegroup3, k = 5, cat_col = 'pagegroup', id_col = 'X1')
cv <- cv %>%
  rename(fold = .folds)
test1 <- subset(cv, fold == 5)
train1 <- anti_join(cv, test1)
## Joining, by = c("X1", "Name", "Overall", "Potential", "Value", "Wage", "Position", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "pagegroup", "fold")
ldacol1 <- c(1, 3:6, 8:45)

trainlda2 <- train1[ldacol1]
testlda2 <- test1[ldacol1]
set.seed(1234)
knn_models <- list()

ktrain <- trainlda[,-c(1,42)]
ktest <- testlda[,-c(1,42)]

for(i in 1:25) {
  knn_models[[i]] <- knn(ktrain, ktest, cl = trainlda$pagegroup, k = i)
}

knn_results <- lapply(knn_models, FUN = function(x) {
  return(confusion(x, testlda$pagegroup, quietly = TRUE)$stats)
}
)

knn_results <- bind_rows(knn_results)
knn_results$K <- 1:25

ggplot(knn_results, aes(x = specificity, y = sensitivity, label = K)) + geom_point() + geom_text(hjust = 2)

set.seed(1)
rf.fifa <- randomForest(pagegroup~., data = trainlda2[,-c(1,43)], mtry = 7, importance = TRUE, ntree = 500)

yhat.bag <- predict(rf.fifa, newdata = testlda2[,-c(1,43)])
confusion(yhat.bag, testlda2$pagegroup)
## yhat is the vector of predicted outcomes, possibly a factor.
##   
##  Sensitivity = (first level predicted) / (first level actual) 
##   
##  Specificity = (second level predicted) / (second level actual)
## $confusion_mat
##             y
## yhat         DEF middle DEF old DEF young GK middle GK old GK young
##   DEF middle        368      60        68         0      0        0
##   DEF old           100     409         0         0      0        0
##   DEF young          49       1       324         0      0        0
##   GK middle           0       0         0        77     18       12
##   GK old              0       0         0        28    133        1
##   GK young            0       0         0        21      1      116
##   MID middle         15       7         4         0      0        0
##   MID old             2      16         0         0      0        0
##   MID young           4       1        30         0      0        0
##   OFF middle          1       1         0         0      0        0
##   OFF old             0       1         0         0      0        0
##   OFF young           0       0         1         0      0        0
##             y
## yhat         MID middle MID old MID young OFF middle OFF old OFF young
##   DEF middle         45       5        16          3       0         0
##   DEF old            21      70         0          0       0         0
##   DEF young           3       0        30          0       0         3
##   GK middle           0       0         0          0       0         0
##   GK old              0       0         0          0       0         0
##   GK young            0       0         0          0       0         0
##   MID middle        156      39        23         62      19         7
##   MID old            10      62         0          7      25         1
##   MID young          36       3       205         13       0        61
##   OFF middle         37      10         7        165      67        32
##   OFF old            10      26         0         35     133         0
##   OFF young          13       2        52         27       1       223
## 
## $stats
##   sensitivity specificity
## 1   0.6827458   0.8245968
varImpPlot(rf.fifa)

league <- read_csv("league - league.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Name = col_character(),
##   Club = col_character(),
##   League = col_character()
## )
## See spec(...) for full column specifications.
league <- na.omit(league)

league3 <- league %>% mutate(perf = ifelse(Whoscored <= 6, "poor", ifelse(Whoscored >= 6.01 & Whoscored <= 6.50, "below", ifelse(Whoscored >= 6.51 & Whoscored <= 7.00, "average", ifelse(Whoscored >= 7.01 & Whoscored <= 7.5, "above", "excellent")))))

colpcal <- c(2:3, 8:10, 47)

league2 <- league[,-colpcal]

league3 <- league3[,-colpcal]

pca_league <- prcomp(league3 %>% dplyr::select(-X1, -Whoscored, -perf), scale = TRUE)

screeplot(pca_league)

pca_scoresleague <- pca_league$x

ldrleague <- pca_scoresleague %>%
  data.frame() %>%
  mutate(X1 = league2$X1, perf = league3$perf) %>%
  dplyr::select(X1, perf, everything())

colors3 <- c("blue", "red", "green", "yellow", "purple")
colors3 <- colors3[as.factor(ldrleague$perf)]
s6d <- scatterplot3d(ldrleague[,c(3,4,5)], pch = 1,  color=colors3)

#legend("right", legend = levels(as.factor(ldrleague$perf)), col = colors3, pch = 16)

ggplot(ldrleague, aes(x = PC1, y = PC2, color = ldrleague$perf)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-15, 7)) +theme_light()

set.seed(1234)
league3 <- subset(league3, select = -c(1,42))
train_size <- floor(0.65 * nrow (league3))
subset <- sample(seq_len(nrow(league3)), size = train_size)
train_league <- league3[subset,]
test_league <- league3[-subset,]
lm.mod <- lm(Whoscored~., data = train_league)
MSPE = mean((test_league$Whoscored - predict.lm(lm.mod, test_league)) ^ 2)
MSPE
## [1] 0.08237169
train_y <- predict(lm.mod, test_league)
plot(lm.mod)

#Ridge
ytr <- league3$Whoscored
xtr <- model.matrix(Whoscored~., league3)[,-1]

cvRidge.out <- cv.glmnet(xtr,ytr,alpha=0,nfolds=5)
plot(cvRidge.out)

mse.minridge <- min(cvRidge.out$cvm)
mse.minridge
## [1] 0.07795221
cat("CV Errors", cvRidge.out$cvm,fill=TRUE)
## CV Errors 0.111561 0.1110418 0.1109166 0.1108502 0.1107778 0.1106989 
## 0.110613 0.1105194 0.1104176 0.1103069 0.1101868 0.1100563 0.109915 
## 0.1097618 0.1095961 0.109417 0.1092237 0.1090153 0.1087909 0.1085497 
## 0.1082907 0.1080132 0.1077161 0.1073987 0.1070602 0.1066997 0.1063167 
## 0.1059103 0.1054801 0.1050255 0.1045461 0.1040418 0.1035122 0.1029574 
## 0.1023775 0.1017729 0.1011442 0.1004919 0.09981642 0.09912052 0.09840517 
## 0.09767211 0.09692344 0.09616156 0.09538914 0.09460906 0.09382443 
## 0.09303854 0.0922548 0.09147671 0.0907078 0.08995154 0.08921132 0.08849029 
## 0.08779134 0.087117 0.08646944 0.0858504 0.08526122 0.08470293 0.08417668 
## 0.08368156 0.08321718 0.08278326 0.08237933 0.0820042 0.08165664 
## 0.08133527 0.0810386 0.08076506 0.0805131 0.08028139 0.08006769 0.07987125 
## 0.07969052 0.07952426 0.07937127 0.07923059 0.07910063 0.07898119 
## 0.07887102 0.07877011 0.07867682 0.07859132 0.078513 0.07844158 0.07837598 
## 0.07831609 0.07826204 0.07821293 0.07816863 0.07812932 0.078094 0.07806312 
## 0.07803554 0.07801188 0.07799149 0.07797433 0.07795997 0.07795221
cat("Lambda with smallest CV Error",
cvRidge.out$lambda[which.min(cvRidge.out$cvm)],fill=TRUE)
## Lambda with smallest CV Error 0.01714158
cat("Coefficients", as.numeric(coef(cvRidge.out, s = "lambda.min")),fill=TRUE)
## Coefficients 3.549959 -0.001595308 0.01891555 0.009841154 0.007227757 
## 0.0003798079 0.0005687167 -0.0002035215 0.0004656983 -0.0008915139 
## -0.0001443171 -6.473863e-05 0.0003366044 0.00134886 -9.463043e-05 
## 0.0008776562 0.001776189 -0.0008266312 -0.001059408 0.003611604 
## -6.285275e-05 -0.001681204 -0.0007804197 0.003115542 0.0004154781 
## -0.0006916806 -0.0007262519 0.00104486 0.0003364664 -0.001030838 
## -0.0002385867 -0.0005185592 -5.907338e-05 3.870985e-05 0.000269259 
## 0.0006258105 0.0004933196 0.001502573 -0.0007679195 -0.00119968
cat("Number of Zero Coefficients",
sum(abs(coef(cvRidge.out))<1e-8),fill=TRUE)
## Number of Zero Coefficients 0
#Lasso
cvLasso.out <- cv.glmnet(xtr,ytr,alpha=1,nfolds=5)
plot(cvLasso.out)

mse.minlasso <- min(cvLasso.out$cvm)
mse.minlasso
## [1] 0.07817806
cat("CV Errors", cvLasso.out$cvm,fill=TRUE)
## CV Errors 0.1113083 0.1067624 0.102601 0.09914565 0.09627655 0.09389419 
## 0.09191595 0.09026808 0.08884736 0.08761073 0.08653092 0.08559207 
## 0.08479483 0.08412937 0.08350497 0.08284906 0.08226885 0.08175441 
## 0.0813173 0.08094293 0.08063297 0.08038348 0.0801821 0.080014 0.07985879 
## 0.0797227 0.07959319 0.07946063 0.07933221 0.07919785 0.07905756 0.0789187 
## 0.07880565 0.07871796 0.07864972 0.07859317 0.07854842 0.07851653 
## 0.07848239 0.07843392 0.07839096 0.07835601 0.07832325 0.07829488 
## 0.07827621 0.07826406 0.07825922 0.07824993 0.07823377 0.07823087 
## 0.07822476 0.07821548 0.07819967 0.07818264 0.07817806 0.07818283 
## 0.07819662 0.0782145 0.0782365 0.07826205 0.07828146 0.07830038 0.07831751 
## 0.07834069 0.07836349 0.07838468 0.07840511 0.07842496 0.0784397 
## 0.07845981 0.07847184 0.07848991 0.07850071 0.07851724 0.0785262 
## 0.07853711 0.07855195 0.0785594 0.07856619 0.07857464 0.07858211 
## 0.07858922 0.07859556 0.07860544 0.07861099 0.078615 0.07861823 0.07862092
cat("Lambda with smallest CV Error",
cvLasso.out$lambda[which.min(cvLasso.out$cvm)],fill=TRUE)
## Lambda with smallest CV Error 0.001127801
cat("Number of Zero Coefficients",sum(abs(coef(cvLasso.out))<1e-8),
fill=TRUE)
## Number of Zero Coefficients 35
coef(cvLasso.out)
## 40 x 1 sparse Matrix of class "dgCMatrix"
##                            1
## (Intercept)     4.7214454440
## Age             .           
## Overall         0.0197585707
## Potential       0.0051389722
## Height          .           
## Weight          .           
## Crossing        .           
## Finishing       .           
## HeadingAccuracy .           
## ShortPassing    .           
## Volleys         .           
## Dribbling       .           
## Curve           .           
## FKAccuracy      .           
## LongPassing     .           
## BallControl     .           
## Acceleration    .           
## SprintSpeed     .           
## Agility         .           
## Reactions       .           
## Balance         .           
## ShotPower       .           
## Jumping         .           
## Stamina         0.0001865443
## Strength        .           
## LongShots       .           
## Aggression      .           
## Interceptions   0.0002555665
## Positioning     .           
## Vision          .           
## Penalties       .           
## Composure       .           
## Marking         .           
## StandingTackle  .           
## SlidingTackle   .           
## GKDiving        .           
## GKHandling      .           
## GKKicking       .           
## GKPositioning   .           
## GKReflexes      .
pcrout <- pcr(Whoscored~., data = train_league, scale=TRUE, validation="CV")
x_train = model.matrix(Whoscored~., train_league)[,-1]
x_test = model.matrix(Whoscored~., test_league)[,-1]

y_train = train_league[,1]

y_test = as.matrix(test_league[,1])

pcr_pred = predict(pcrout, x_test, ncomp=6)
pcr_pred1 = as.matrix(pcr_pred)
mean((pcr_pred1-y_test)^2)
## [1] 0.08442751
validationplot(pcrout,val.type="MSEP")

plsout <- plsr(Whoscored~., data = train_league, scale=TRUE, validation="CV")
x_train = model.matrix(Whoscored~., train_league)[,-1]
x_test = model.matrix(Whoscored~., test_league)[,-1]

y_train = train_league[,1]

y_test = as.matrix(test_league[,1])

pls_pred = predict(plsout, x_test, ncomp=6)
pls_pred1 = as.matrix(pls_pred)
mean((pls_pred1-y_test)^2)
## [1] 0.08240741
validationplot(plsout, val.type="MSEP")

#Elastic Net
set.seed(1234)
cv_5 = trainControl(method = "cv", number = 5)
cvElastic.out <- train(Whoscored~., data = league3, method = "glmnet",trControl = cv_5)
cvElastic.out
## glmnet 
## 
## 2283 samples
##   39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1826, 1827, 1825, 1828, 1826 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        RMSE       Rsquared   MAE      
##   0.10   0.0003428315  0.2813807  0.2908060  0.2176521
##   0.10   0.0034283153  0.2807828  0.2932995  0.2173039
##   0.10   0.0342831525  0.2805064  0.2952396  0.2176827
##   0.55   0.0003428315  0.2812248  0.2914896  0.2175642
##   0.55   0.0034283153  0.2803356  0.2952431  0.2172293
##   0.55   0.0342831525  0.2830744  0.2871470  0.2204476
##   1.00   0.0003428315  0.2810755  0.2921273  0.2174833
##   1.00   0.0034283153  0.2800792  0.2965043  0.2171685
##   1.00   0.0342831525  0.2856996  0.2805554  0.2229247
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda
##  = 0.003428315.
plot(cvElastic.out)